% scribe: Saurabh Amin
% lastupdate: Nov. 16, 2005
% lecture: 19
% references: Durrett, section 4.1
% title: Conditional Probability and Conditional Expectations
% keywords: Conditional expectation: formal definition, elementary motivations, existence of conditional expectation in simple cases, general proofs of existence of conditional expectation, uniqueness of conditional expectation.
% end

\documentclass[12pt,letterpaper]{article}
\include{macros}

\begin{document}

\lecture{19}{Conditional Probability \& Expectation}{Saurabh Amin}{amins@berkeley.edu}

This set of notes is a revision of the work of Charles C.\ Fowlkes.
Reference: \cite{durrett}, section 4.1.

\section{Definition of Conditional Expectation}
% keywords: Conditional expectation: formal definition
% end

We present the definition of conditional expectation due to Kolmogorov (1933).

\begin{definition}
Given a probability space $(\Omega, \F, \P)$, a sub-$\sigma$-field $\G \subset \F$, and a random variable $X\in \L^{1}(\F)$ (meaning that $X$ is $\F$-measurable and $\E|X|<\infty$), the conditional expectation of $X$ given $\G$ is the (almost surely unique) random variable $\hat{X}$ such that
\begin{itemize}
\item[i.]{$\hat{X}\in \L^{1}(\G)$, that is, $\hat{X}$ is $\G$-measurable and $\E|\hat{X}|<\infty$; and}
\item[ii.]{$\E(\hat{X}\1_{G}) = \E(X\1_{G})$ for all $G\in\G$: that is, $\hat{X}$ integrates like $X$ over all $\G$-sets.}
\end{itemize}
\label{defn}
\end{definition}

Recall that $\E(X\1_{G})=\int_{G}X\,d\P$, so condition \textit{(ii)} says that $\int_G\hat{X}\,d\P=\int_G X\,d\P$ for every $G\in\G$. The random variable $\hat{X}$ is denoted by $\E(X |\G)$.

We now motivate this definition by elementary considerations. Recall the `undergraduate' definition of conditional probability,
\[
\P(A | B) \equiv \frac{\P(A\cap B)}{\P(B)}
\]
for $\P(B)>0$. Now if $G_1, G_2, \ldots$ is a partition of $\Omega$ into measurable sets (each of positive probability), then
\begin{equation}
\P(A) = \sum_i \P(A \cap G_i)=\sum_i \P(A|G_i)\P(G_i).
\label{totalprob}
\end{equation}
Recall that this is a form of the law of total probability. $\P(\cdot|B)$ is a new probability measure on $\Omega$ which concentrates on $B$. We can then generalize naturally to conditional expectations because $\P(d\omega|B)$ can be integrated against just like any $\P(d\omega)$. Using this fact and the definition above, we have
\begin{eqnarray*}
\E(X|B) &=& \int_\Omega X(\omega) \P(d\omega | B) \\
&=& \frac{\int X(\omega)\1(\omega\in B) \P(d\omega)}{\P(B)} \\
&=& \frac{\E(X \1_{B})}{\P(B)}.
\end{eqnarray*}
Note that equation (\ref{totalprob}) is obtained by multiplying the identity $\1=\sum_{i}\1_{G_i}$ on both sides by $\1_A$ and taking expectations. Multiplying the same identity by $X$ instead and taking expectations generalizes this to
\begin{equation}
\E(X) = \sum_i \E(X \1_{G_i})=\sum_i \E(X|G_i)\P(G_i),
\label{texp}
\end{equation}
which holds provided $\E|X|<\infty$.

A variation of equation (\ref{texp}) can be obtained as follows. Let $G$ be any union of the $G_i$, i.e., $G\in\G$ where $\G=\sigma(G_i, i=1,2,\ldots)$. Again, multiplying the identity $\1_{G}=\sum_{i:G_i\subset G}\1_{G_i}$ on both sides by $X$ and taking expectations, we obtain
\begin{eqnarray}
\E(X\1_{G}) &=& \sum_{i:G_i\subset G} \E(X\1_{G_i})\nonumber\\
&=& \sum_{i:G_i\subset G}\E(X|G_i)\P(G_i).
\label{keyobs}
\end{eqnarray}
The R.H.S.\ of equation (\ref{keyobs}) can be interpreted as $\E(\hat{X}\1_{G})$, where $\hat{X}=\sum_{i}\E(X|G_i)\1_{G_i}$; that is, $\hat{X}$ takes the value $\E(X|G_i)$ if $G_i$ occurs.

We have just shown that $\E(X\1_G)=\E(\hat{X}\1_{G})$ for every $G$ which is a union of the $G_i$'s; since every set in $\G$ is such a union, this is condition \textit{(ii)} of Definition \ref{defn}. Moreover, $\hat{X}$ is measurable w.r.t.\ $\G$ and $\E|\hat{X}|\leq\sum_i|\E(X|G_i)|\P(G_i)\leq\E|X|<\infty$, so condition \textit{(i)} of Definition \ref{defn} is also satisfied. So we have constructed $\hat{X}=\E(X|\G)$ explicitly in the case when $\G=\sigma(G_1,G_2,\ldots)$.
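As a concrete toy illustration of this construction, consider a fair die: take $\Omega=\{1,\ldots,6\}$ with the uniform measure, $X(\omega)=\omega$, and $\G=\sigma(G_1,G_2)$ with $G_1=\{1,3,5\}$ and $G_2=\{2,4,6\}$, so that $\G$ records only the parity of the roll. Then
\[
\E(X|G_1)=\frac{\E(X\1_{G_1})}{\P(G_1)}=\frac{(1+3+5)/6}{1/2}=3,
\qquad
\E(X|G_2)=\frac{\E(X\1_{G_2})}{\P(G_2)}=\frac{(2+4+6)/6}{1/2}=4,
\]
so $\hat{X}=3\cdot\1_{G_1}+4\cdot\1_{G_2}$. Condition \textit{(ii)} can be checked directly: for example, $\E(\hat{X}\1_{G_1})=3\cdot\frac{1}{2}=\frac{3}{2}=\E(X\1_{G_1})$, and $\E(\hat{X})=\frac{3+4}{2}=\frac{7}{2}=\E(X)$.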
\section{Existence and Uniqueness of Conditional Expectation}

\begin{proposition}
$\E(X|\G)$ is unique up to almost sure equivalence.
\end{proposition}

\begin{proofsketch}
Suppose that two random variables $\hat{X}_1$ and $\hat{X}_2$ are candidates for the conditional expectation $\E(X|\G)$. Let $Y:=\hat{X}_1-\hat{X}_2$, so that $Y\in \L^{1}(\G)$ and $\E(Y\1_G)=0$ for all $G\in\G$. In particular, choose $G=\{Y>\epsilon\}$, which gives $\E(Y\1(Y>\epsilon))=0$. By Markov's inequality, $\P(Y>\epsilon)\leq\E(Y\1(Y>\epsilon))/\epsilon=0$. Interchanging the roles of $\hat{X}_1$ and $\hat{X}_2$, we also have $\P(Y<-\epsilon)=0$. Since $\epsilon>0$ is arbitrary, $\P(Y=0)=1$.
\end{proofsketch}

\begin{proposition}
$\E(X|\G)$ exists.
\end{proposition}

We give three different approaches to the general case. In the proofs below we write $Y$ for the integrable random variable whose conditional expectation we construct.

\subsection{Measure Theory Proof}

Here we pull out some power tools from measure theory.

\begin{theorem}[Lebesgue-Radon-Nikodym]
(See \cite{durrett}, p.~477.) If $\mu$ and $\lambda$ are non-negative $\sigma$-finite measures on a $\sigma$-field $\G$ and $\mu(G) = 0 \implies \lambda(G) = 0$ for all $G \in \G$ (written $\lambda \ll \mu$, pronounced ``$\lambda$ is absolutely continuous with respect to $\mu$''), then there exists a non-negative $\G$-measurable function ${\hat Y}$ such that
\[
\lambda (G) = \int_G {\hat Y}\, d\mu
\]
for all $G \in \G$. If $\hat{X}$ is another such function then $\hat{X}=\hat{Y}$ $\mu$-a.e.
\end{theorem}

\begin{proofsketch}
(Existence via Lebesgue-Radon-Nikodym.) Assume $Y \geq 0$ and define the measure
\[
Q(C) = \int_C Y\, d\P = \E( Y \1_{C} ), \qquad C\in\G,
\]
which is non-negative and finite because $\E |Y| < \infty$. Note that $Q$ is absolutely continuous with respect to $\P$ restricted to $\G$. The theorem then gives a $\G$-measurable ${\hat Y}$ with $\E(Y\1_C)=\E(\hat{Y}\1_C)$ for all $C\in\G$, which is exactly what is required for ${\hat Y}$ to be a version of the conditional expectation $\E (Y | \G)$. For general $Y$ we can employ $\E(Y^+ | \G) - \E(Y^- | \G)$.
\end{proofsketch}

\subsection{Hilbert Space Method}

This gives a nice geometric picture for the case when $Y \in \L^2$.

\begin{lemma}\label{lemma:hilbunique}
Every nonempty, closed, convex set $E$ in a Hilbert space $H$ contains a unique element of smallest norm.
\end{lemma}

\begin{lemma}[Existence of Projections in Hilbert Space]
Given a closed subspace $K$ of a Hilbert space $H$ and an element $x \in H$, there exists a decomposition $x = y + z$ where $y \in K$ and $z \in K^\perp$ (the orthogonal complement).
\end{lemma}

The idea for the existence of projections is to let $z$ be the element of smallest norm in the closed convex set $x+K$ and set $y = x-z$. See Rudin (1987), p.~79, for a full discussion of Lemma \ref{lemma:hilbunique}.

\begin{proofsketch}
(Existence via Hilbert Space Projection.) \label{proof:hilbert}
Suppose $Y \in \L^2(\F)$. Requirement \textit{(ii)} demands that for all $X \in \L^2(\G)$
\[
\E\left( (Y - \E(Y|\G)) X \right) = 0
\]
(condition \textit{(ii)} states this for indicators $X=\1_G$, $G\in\G$; it extends to all of $\L^2(\G)$ by linearity and $\L^2$-approximation). This has the geometric interpretation of requiring $Y - \E(Y|\G)$ to be orthogonal to the subspace $\L^2(\G)$. Requirement \textit{(i)} says that $\E(Y|\G) \in \L^2(\G)$, so $\E(Y|\G)$ is just the orthogonal projection of $Y$ onto the closed subspace $\L^2(\G)$. The lemmas above show that such a projection is well defined.
\end{proofsketch}
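To see the geometry in the simplest nontrivial case (a toy setting chosen only for illustration), take $\G=\sigma(B)$ for a single event $B$ with $0<\P(B)<1$. Then $\L^2(\G)$ consists of the square-integrable functions that are constant on $B$ and on $B^c$, i.e., the span of $\1_B$ and $\1_{B^c}$. Minimizing $\E\left((Y-a\1_B-b\1_{B^c})^2\right)$ over $a,b$ gives $a=\E(Y\1_B)/\P(B)$ and $b=\E(Y\1_{B^c})/\P(B^c)$, so the orthogonal projection of $Y$ onto $\L^2(\G)$ is
\[
\hat{Y}=\frac{\E(Y\1_B)}{\P(B)}\,\1_B+\frac{\E(Y\1_{B^c})}{\P(B^c)}\,\1_{B^c},
\]
which is exactly the elementary formula from Section 1: $\E(Y|B)$ on $B$ and $\E(Y|B^c)$ on $B^c$. Orthogonality is immediate: $\E((Y-\hat{Y})\1_B)=\E(Y\1_B)-\frac{\E(Y\1_B)}{\P(B)}\P(B)=0$, and similarly for $\1_{B^c}$.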
\subsection{``Hands On'' Proof}

This is a hands-on approach: we extend the discrete case via limits. We will make use of:

\begin{lemma}[Williams' Tower Property]
Suppose $\G \subset \H \subset \F$ are nested $\sigma$-fields, $Y\in\L^1(\F)$, and $\E( \cdot | \G)$ and $\E(\cdot | \H)$ are both well defined. Then $\E(\E(Y|\H)|\G) = \E(Y|\G) = \E(\E(Y|\G)|\H)$.
\end{lemma}

A special case is when $\G = \{\emptyset,\Omega\}$: then $\E(Y|\G) = \E Y$ is a constant, so it is easy to see that $\E(\E(Y|\H)|\G) = \E(\E(Y|\H)) = \E(Y)$ and $\E(\E(Y|\G)|\H) = \E(\E(Y)|\H) = \E(Y)$.

\begin{proofsketch}
(Existence via Limits.) For a disjoint partition $\Omega=\bigsqcup_i G_i$ and $\G = \sigma(\{G_i\})$ we have shown that
\[
\E(Y|\G) = \sum_i \frac{\E(Y\1_{G_i})}{\P(G_i)}\1_{G_i},
\]
where we deal appropriately with the niggling possibility of $\P(G_i) = 0$ by either throwing out the offending sets or defining $\frac{0}{0} = 0$.

We now consider an arbitrary but countably generated $\sigma$-field $\G$. This situation is not too restrictive; for example, the $\sigma$-field generated by an $\R$-valued random variable $X$ is generated by the countable collection $\{ B_i = (X \leq r_i) : r_i \in \Q\}$. If we set $\G_n = \sigma(B_1,B_2,\ldots,B_n)$ then the $\G_n$ are increasing, $\G_1 \subset \G_2 \subset \ldots$, and $\G = \sigma(\cup_n \G_n)$. For a given $n$ the random variable $Y_n = \E(Y|\G_n)$ exists by our explicit construction above, since $\G_n$ is generated by the finite partition of $\Omega$ into the atoms obtained by intersecting $B_1,\ldots,B_n$ and their complements. Now we show that $Y_n$ converges in an appropriate manner to a $Y_\infty$ which will then serve as a version of $\E(Y|\G)$. We will assume that $\E|Y|^2 < \infty$.

Write $Y_n = \E(Y|\G_n) = Y_1 + (Y_2 - Y_1) + (Y_3 - Y_2) + \ldots + (Y_n - Y_{n-1})$. The terms in this summation are orthogonal in $\L^2$: for $m<n$ and any $Z\in\L^2(\G_m)$, the defining property \textit{(ii)} gives $\E(Y_nZ)=\E(YZ)=\E(Y_mZ)$, hence $\E((Y_n-Y_m)Z)=0$. So we can compute the second moment as
\[
s_n^2 = \E(Y_n^2) = \E (Y_1^2) + \E((Y_2-Y_1)^2) + \ldots + \E ((Y_n-Y_{n-1})^2),
\]
the cross terms being zero. Similarly, $Y-Y_n$ is orthogonal to $\L^2(\G_n)$, so
\[
s^2 := \E(Y^2) = \E\left((Y_n + (Y-Y_n))^2\right) = \E(Y_n^2)+\E((Y-Y_n)^2) \geq s_n^2 < \infty.
\]
Then $s_n^2 \uparrow s_\infty^2 \leq s^2 < \infty$. For $n>m$ we know, again by orthogonality, that $\E((Y_n - Y_m)^2) = s_n^2 - s_m^2 \to 0$ as $m,n \to \infty$, since $(s_n^2)$ is a bounded nondecreasing (hence convergent) real sequence. This means that the sequence $Y_n$ is Cauchy in $\L^2$, and invoking the completeness of $\L^2$ we conclude that $Y_n \to Y_\infty$ in $\L^2$.

All that remains is to check that $Y_\infty$ is a conditional expectation. It satisfies requirement \textit{(i)} since, as a limit of $\G$-measurable variables, it is $\G$-measurable (passing to an a.s.\ convergent subsequence gives a $\G$-measurable version). To check \textit{(ii)} we need to show that $\E(Y\1_{G}) = \E(Y_\infty\1_{G})$ for all $G\in\G$. As usual, it suffices to check this for $G$ in an intersection-closed collection ${\cal A}$ with $\sigma({\cal A}) = \G$; take ${\cal A} = \cup_m \G_m$. For $G\in\G_m$,
\[
\E(Y \1_{G}) = \E(Y_m \1_{G}) = \E(Y_n \1_{G})
\]
holds for any $n > m$, since $G\in\G_m\subset\G_n$. Noting that $\E(Y_n Z) \to \E(Y_\infty Z)$ for all $Z \in \L^2$ by the continuity of the inner product, we may let $n\to\infty$ to obtain $\E(Y \1_{G}) = \E(Y_\infty \1_{G})$ for every $G\in{\cal A}$, and hence for every $G\in\G$.
\end{proofsketch}
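As a quick sanity check on the orthogonality used above (again in a toy setting chosen only for illustration), consider two independent fair coin tosses, let $Y$ be the total number of heads, and let $\G_1=\sigma(\mbox{first toss})\subset\G_2=\sigma(\mbox{both tosses})$. Then $Y_2=\E(Y|\G_2)=Y$, while $Y_1=\E(Y|\G_1)$ equals $3/2$ if the first toss is heads and $1/2$ otherwise. The increment $Y_2-Y_1$ equals $+1/2$ or $-1/2$ according to the second toss only, hence it is independent of $Y_1$ and has mean zero, so $\E(Y_1(Y_2-Y_1))=0$. Correspondingly,
\[
\E(Y_2^2)=3/2=5/4+1/4=\E(Y_1^2)+\E((Y_2-Y_1)^2),
\]
as the orthogonal decomposition predicts.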
\begin{exercise}
Remove the countably generated constraint on $\G$. (\emph{Hint:} Be a bit more clever: for $Y \in \L^2$ look at $\E(Y|\H)$ for finite sub-$\sigma$-fields $\H \subset \G$. Then, as above, $\sup_\H \E(\E(Y|\H)^2) \leq \E Y^2$, so we can choose $\H_n$ with $\E(\E(Y|\H_n)^2)$ increasing to this supremum. The $\H_n$ may not be nested, but argue that the ${\cal C}_n = \sigma(\H_1 \cup \H_2 \cup \ldots \cup \H_n)$ are nested, and let $\hat Y = \lim_n \E(Y|{\cal C}_n)$.)
\end{exercise}

\begin{exercise}
Remove the $\L^2$ constraint on $Y$. (\emph{Hint:} Consider $Y \geq 0$ and show convergence of $\E(Y \wedge n\ |\ \G)$, then turn the crank on the standard machinery.)
\end{exercise}

\bibliographystyle{plain}
\bibliography{../books.bib}

\end{document}